我们将使用系统内安装的R:
# bash命令
source /sibcb/program/install/hdf5-1.10/profile
source /sibcb/program/install/r-4.4/profile加载R包:
pbmc <- readRDS('output/pbmc.RDS')
pbmc
An object of class Seurat
13714 features across 2638 samples within 1 assay
Active assay: RNA (13714 features, 2000 variable features)
3 layers present: counts, data, scale.data
2 dimensional reductions calculated: pca, umap
pbmc <- RunMCA(pbmc, nmcs = 50, reduction.name = "mca", slot = "data")
1.597 sec elapsed
100.112 sec elapsed
4.184 sec elapsed
pbmc
An object of class Seurat
13714 features across 2638 samples within 1 assay
Active assay: RNA (13714 features, 2000 variable features)
3 layers present: counts, data, scale.data
3 dimensional reductions calculated: pca, umap, mca
MCA分解能够同时将细胞和基因投射到同一个空间。比如,我们可以展示CD14 Monocytes的标识基因CD14,以及CD4 T cell的标识基因CD14:
DimPlotMC(pbmc, reduction = "mca", group.by = "cellType",
features = c("IL7R", "CD14"), as.text = TRUE) +
ggtitle("MCA with some key gene markers")
我们也可以回顾下先前分类的结果:
通过超几何分布的检验,返回Log10转化的adjust P-value.
gsl <- list(BSA = readLines('CelliD/B_cell_activation.txt'))
HGT <- RunCellHGT(pbmc, pathways = gsl, dims = 1:50, n.features = 200, log.trans = T, p.adjust = T)
展示活性的分布:
Bscore <- HGT[,1]
pbmc <- AddMetaData(pbmc, Bscore, col.name = 'B_cell_activation')
FeaturePlot(pbmc, features = c('B_cell_activation'))
label <- rep('others', length(pbmc$cellType))
label[pbmc$cellType == "B"] <- "B"
roc_empirical <- rocit(score = pbmc$B_cell_activation, class = label, negref = "others")
title <- paste0('AUC:', strsplit(as.character(summary(roc_empirical)[4,]), ':')[[1]][2])
Method used: empirical
Number of positive(s): 344
Number of negative(s): 2294
Area under curve: 0.9476
我们分析的第二个数据是来自于胰腺的scRNA-seq Baron et al. 2016.
data("HgProteinCodingGenes")
BaronMatrix <- readRDS('CelliD/BaronMatrix.rds')
BaronMetaData <- readRDS('CelliD/BaronMetaData.rds')
BaronMatrixProt <- BaronMatrix[rownames(BaronMatrix) %in% HgProteinCodingGenes,]
Baron <- CreateSeuratObject(counts = BaronMatrixProt, project = "Baron", min.cells = 5, meta.data = BaronMetaData)
Baron
An object of class Seurat
14804 features across 8569 samples within 1 assay
Active assay: RNA (14804 features, 0 variable features)
1 layer present: counts
Baron <- NormalizeData(Baron)
Baron <- ScaleData(Baron, features = rownames(Baron))
Baron <- RunMCA(Baron)
6.339 sec elapsed
109.752 sec elapsed
14.128 sec elapsed
DimPlotMC(Baron, reduction = "mca", group.by = "cell.type",
features = c("CTRL", "INS", "MYZAP", "CDH11"), as.text = TRUE) +
ggtitle("MCA with some key gene markers")
Baron <- RunPCA(Baron, features = rownames(Baron))
Baron <- RunUMAP(Baron, dims = 1:30)
Baron <- RunTSNE(Baron, dims = 1:30)
Baron <- RunTSNE(Baron, dims = 1:30)
tSNE <- DimPlot(Baron, reduction = "tsne", group.by = "cell.type")+ ggtitle("tSNE") +
theme(legend.text = element_text(size =10), aspect.ratio = 1)
UMAP <- DimPlot(Baron, reduction = "umap", group.by = "cell.type") + ggtitle("UMAP") +
theme(legend.text = element_text(size =10), aspect.ratio = 1)
ggarrange(tSNE, UMAP, common.legend = TRUE, legend = "top")
读取细胞marker数据文件:
chunks <- strsplit(readLines('CelliD/PanglaoDB_markers_27_Mar_2020.tsv.gz'), '\t')
header <- chunks[[1]]
chunks <- chunks[-1]
df <- as.data.frame(do.call(rbind, chunks), stringsAsFactors = FALSE)
colnames(df) <- gsub(' ', '_', header)
df <- df %>% filter(organ == "Pancreas", str_detect(species, "Hs"))
head(df)
species official_gene_symbol cell_type nicknames
1 Mm Hs CTRB1 Acinar cells CTRB
2 Mm Hs KLK1 Acinar cells Klk6
3 Mm Hs RBPJL Acinar cells RBP-L|SUHL|RBPSUHL
4 Mm Hs PTF1A Acinar cells PTF1-p48|bHLHa29
5 Mm Hs CELA3A Acinar cells ELA3|ELA3A
6 Mm Hs PRSS1 Acinar cells TRY1
ubiquitousness_index
1 0.017
2 0.013
3 0.001
4 0.001
5 0.001
6 0.002
product_description
1 chymotrypsinogen B1
2 kallikrein 1
3 recombination signal binding protein for immunoglobulin kappa J region like
4 pancreas associated transcription factor 1a
5 chymotrypsin like elastase family member 3A
6 serine protease 1
gene_type canonical_marker germ_layer organ
1 protein-coding gene 1 Endoderm Pancreas
2 protein-coding gene 1 Endoderm Pancreas
3 protein-coding gene 1 Endoderm Pancreas
4 protein-coding gene 1 Endoderm Pancreas
5 protein-coding gene 1 Endoderm Pancreas
6 protein-coding gene 1 Endoderm Pancreas
sensitivity_human sensitivity_mouse specificity_human
1 1.0 0.957143 0.000628931
2 0.833333 0.314286 0.00503145
3 0.0 0.0 0.0
4 0.0 0.157143 0.000628931
5 0.833333 0.128571 0.0
6 1.0 0.0285714 0.00597484
specificity_mouse
1 0.0159201
2 0.0128263
3 0.0
4 0.000773445
5 0.0
6 0.0
构建gene set list:
panglao_pancreas <- df %>% group_by(`cell_type`) %>% summarise(geneset = list(`official_gene_symbol`))
pancreas_gs <- setNames(panglao_pancreas$geneset, panglao_pancreas$`cell_type`)
names(pancreas_gs)
[1] "Acinar cells" "Alpha cells"
[3] "Beta cells" "Delta cells"
[5] "Ductal cells" "Epsilon cells"
[7] "Gamma (PP) cells" "Pancreatic progenitor cells"
[9] "Pancreatic stellate cells" "Peri-islet Schwann cells"